This dataset contains the spirits purchase information of Iowa Class “E” liquor licensees by product and date of purchase from January 1, 2012 to 2017. The dataset can be used to analyze total spirits sales in Iowa of individual products at the store level. Link is here

The content that we will cover in this analysis:
Note: The original data is cut from big dataset from Kaggle
#need to run this code if you need to install cufflinks
!pip install cufflinks
#need to run this command if you need to install chart_studio
!pip install chart_studio
import pandas as pd
import dask.dataframe as dd
from dask import compute
import dask
import plotly.graph_objects as go
import chart_studio.plotly as py
import cufflinks as cf
from plotly.offline import iplot, init_notebook_mode
from plotly.offline import iplot
from matplotlib import pyplot as plt
import plotly.express as px
import numpy as np
import csv
df_full = dd.read_csv("liquor.trimmed.csv",engine="python",
error_bad_lines=False,dtype={
'Bottles Sold': 'float64',
'County Number': 'object',
'Item Number': 'object',
'Pack': 'float64',
'Store Number': 'object',
'Unnamed: 0': 'object',
'Vendor Number': 'object',
'Bottle Volume (ml)': 'float64',
'Zip Code': 'object'})
df_full.compute().shape
There are 2000942 rows and 24 columns in total
df_pandas = df_full.compute()
df_dask = dd.from_pandas(df_pandas, npartitions = 8)
df_dask.columns
df_dask.isnull().sum().compute()
df_dask = df_dask.dropna()
df_dask.compute().shape
df_dask['State Bottle Cost'] = df_dask['State Bottle Cost'].str.replace('$', '')
df_dask['State Bottle Retail'] = df_dask['State Bottle Retail'].str.replace('$', '')
df_dask['Sale (Dollars)'] = df_dask['Sale (Dollars)'].str.replace('$', '')
df_dask = df_dask.astype({'Store Number': 'int64', 'County Number': 'int64',
'Vendor Number': 'int64', 'Pack': 'int64', 'Bottles Sold': 'int64',
'State Bottle Cost': 'float64', 'State Bottle Retail': 'float64',
'Sale (Dollars)': 'float64' })
df_dask['Actual Sales(Dollar)'] = df_dask['State Bottle Retail']*df_dask['Bottles Sold']
df_dask.head(5)
The following part is analysed by Nhu Nguyen
city_volume = df_dask.groupby('City')['Volume Sold (Liters)'].sum().compute().sort_values(ascending = False)
top_volume = city_volume.head(20)
fig = px.bar(top_volume, title = 'Top 20 liquor consumption cities in Iowa state',
color_discrete_sequence = ['forestgreen'])
fig.update_layout(yaxis_title = 'Volume Sold (Liters)', xaxis_title = 'Iowa State Cities')
fig.show()
county_volume = df_dask.groupby('County')['Volume Sold (Liters)'].sum().compute().sort_values(ascending = False)
county_volume = county_volume.head(20)
fig = px.bar(county_volume, title = 'Top 20 liquor consumption counties in Iowa state')
fig.update_layout(yaxis_title = 'Volume Sold (Liters)', xaxis_title = 'Iowa State Counties')
fig.show()
Polk is the most populous county of Iowa state when it's hosting over 14% of the state's presidents. The liquor consumption of this county is kind double the second one (600k in comparison to 300k).
item_dcpt = df_dask.groupby(['City', 'Item Description']).sum().compute()
item_dcpt.reset_index(inplace = True)
item_sold_dm= item_dcpt[item_dcpt['City'] == 'Des Moines'].sort_values(by = 'Volume Sold (Liters)',ascending = False)
item_sold_cr= item_dcpt[item_dcpt['City'] == 'Cedar Rapids'].sort_values(by = 'Volume Sold (Liters)',ascending = False)
item_sold_d= item_dcpt[item_dcpt['City'] == 'Davenport'].sort_values(by = 'Volume Sold (Liters)',ascending = False)
item_sold_wdm= item_dcpt[item_dcpt['City'] == 'West Des Moines'].sort_values(by = 'Volume Sold (Liters)',ascending = False)
item_sold_cb= item_dcpt[item_dcpt['City'] == 'Council Bluffs'].sort_values(by = 'Volume Sold (Liters)',ascending = False)
item_sold = pd.concat([item_sold_dm.head(5), item_sold_cr.head(5),item_sold_d.head(5),
item_sold_wdm.head(5),item_sold_cb.head(5)])
px.bar(item_sold, x = 'City', y = 'Volume Sold (Liters)', color = 'Item Description')
There are 11 popular liquors in 5 cities:

store_locate = df_dask.groupby(['Store Name','Store Location']).sum().compute()
store_locate.reset_index(inplace = True)
store_locate_splt = store_locate['Store Location'].str.split('\\n', n = 3, expand = True)
store_locate['Longitute']= store_locate_splt[2].str.split(',', expand = True)[0].str.replace('(', '')
store_locate['Latitute'] = store_locate_splt[2].str.split(',', expand = True)[1].str.replace(')', '')
store_locate.dropna(inplace = True)
store_locate
store_locate["Text"] = store_locate['Store Name'] + ":" + store_locate['Actual Sales(Dollar)'].astype(str)
fig = go.Figure(data=go.Scattergeo(
locationmode = 'USA-states',
lon = store_locate['Latitute'],
lat = store_locate['Longitute'],
text = store_locate['Text'],
mode = 'markers', marker_color = store_locate['Actual Sales(Dollar)'],
))
fig.update_layout(
title = 'Actual sales of each store in Iowa state',
geo_scope = 'usa'
)
fig.show()
The following part is analysed by Radim Musalek
Liquor in the U.S. can be sold in various sizes and we'd expect the 750ml and 1000ml to be among the most popular ones. We decided to analyse the dataset to find out which bottle sizes were the most sold in Iowa over the given period of time.
bottle_volume_pop_full = df_dask.groupby("Bottle Volume (ml)").sum().reset_index()
fig_pop = px.bar(bottle_volume_pop_full.compute().sort_values("Bottle Volume (ml)"),
x="Bottle Volume (ml)", y="Bottles Sold")
fig_pop.update_layout(title = "Favourite bottle sizes",
yaxis_title = "Bottles Sold",
xaxis =dict(title="Bottle Volume (ml)",
type="category"))
As we can see from the chart above the 750ml was the most sold bottle size as most stronger liquor is being sold in this volume. This was followed by 1750ml, often strong liquor for events is being sold in this size, then 375ml, i.e. half of 750ml size, and then also by the originally expected 1000ml. The sales volumes of the other sizes were mostly insignicant.
Two theories on the use of 750ml:
There was also one surprising bottle size, 31.5 litre bottles, which we decided to investigate further to find out what kind of liquor is being sold in such bottles.
bot_31500 = df_dask[df_dask["Bottle Volume (ml)"] == 31500]
fig_31500 = px.bar(bot_31500.compute(),
x="Item Description", y="Bottles Sold", color="Item Description")
fig_31500.update_layout(title = "31.5 litre selection",
yaxis_title = "Bottles Sold")
There were 3 kinds of liquor sold in the 31.5litre bottles:
However, as we can see from the product descriptions the sold products were actually only packages of 18 1.75 litre bottles so nothing unusual.
There hundreds of shops in the Iowa state so we would like to find the one with the largest and the smallest selection. This could help us for example with recommendations for buyers where to find a broad selection and which shops are rather for specialised purchases.
selection = df_dask.compute().pivot_table(index="Category Name", columns="Store Name", values="State Bottle Retail", aggfunc=[lambda x: True if x.any() != np.nan else False])
selection.columns = selection.columns.get_level_values(1)
selection_tot = {}
for shop in selection.columns:
sel = selection[shop].sum()
selection_tot[shop] = sel
sel_pd = pd.DataFrame.from_dict(selection_tot, orient="index", columns=["Product selection"])
sel_top20 = sel_pd.nlargest(20, "Product selection")
sel_low20 = sel_pd.nsmallest(20, "Product selection")
fig_top20 = px.bar(sel_top20, y="Product selection", x=sel_top20.index)
fig_top20 = fig_top20.update_layout(title = "Top20 shops in selection variability",
yaxis_title = "Product groups in selection",
xaxis_title="Shop name")
fig_low20 = px.bar(sel_low20, y="Product selection", x=sel_low20.index)
fig_low20 = fig_low20.update_layout(title = "Lowest 20 shops in selection variability",
yaxis_title = "Product groups in selection",
xaxis_title="Shop name")
fig_top20.show()
fig_low20.show()
We'd expect higher differences in the largest shops offerings but since the products are grouped into 57 categories in our dataset the level of the most sold products per shop in the largest selection is pretty flat, ranging from 56 products for the top1 shop to 53 products for the top20 shop.
There're two groups in the top20 list:
In the section with the lowest variation of products we can find mostly local distileries offerring their own products which would be surely in contrast with the Finnish market (if investigated) where distileries aren't allowed to sell directly to retail customers.
This following part is analysedby Dmitry Buiskikh
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from plotly.subplots import make_subplots
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
'Jul', 'Aug', 'Sep', 'Oct']
weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri","Sat" ]
weeks = [str(i) for i in range(1,45)]
df_dask["Date"].dtype
df_dask["Date"] = dd.to_datetime(df_dask["Date"],dayfirst=False)
df_dask["Date"].dtype
dfSorted = df_dask.compute().sort_values(by="Date").reset_index()
type(dfSorted)
dfSorted.head()
DFD = dfSorted.set_index("Date")
DFD_MI = dfSorted.set_index([DFD.index.year, "Date"])
DFD_MI.head()
The DataSet starts from the middle of December 2016. In order to make the analyze more descriptive, we decided to concentrate only on the year of 2017. In fact, on 10 months of 2017, as the DataSet ends on 2017-10-31.
DFD2017 = DFD_MI.loc[2017]
type(DFD2017)
DFD2017.shape
10 months of 2017 have 1.87 mln of rows. Enough for the analyze to be reliable.
DFD2017.tail()
DFD2017Month = DFD2017.resample("M").sum()
DFD2017Month.head(2)
fig = px.line(x=months, y=DFD2017Month["Actual Sales(Dollar)"])
fig.update_traces(mode='lines+markers')
fig.update_layout(
title='Total monthly sales, $',
yaxis_title='$',
yaxis_range =[0, 8000000],
xaxis_title=None,
showlegend = False,
autosize=True, height = 500,
width = 1000
)
fig.show()
DFD2017Week = DFD2017.resample("W").sum()
fig = px.line(x=weeks[:-1], y=DFD2017Week["Actual Sales(Dollar)"][:-1])
fig.update_traces(mode='lines+markers')
fig.update_layout(
title='Total weekly sales, $',
yaxis_title='$',
yaxis_range =[0, 2000000],
xaxis_title=None,
showlegend = False,
autosize=True, height = 500,
width = 1000
)
fig.show()
DFD2017["weekday"] = DFD2017.index.day_name().str[:3]
DFD2017["weekday"] = DFD2017.index.day_name().str[:3]
paivat = []
for day in weekdays[:-1]:
paivat.append(int(DFD2017[DFD2017["weekday"]==day]["Actual Sales(Dollar)"].sum().round(0)))
fig = px.line(x=weekdays[:-1], y=paivat)
fig.update_traces(mode='lines+markers')
fig.update_layout(
title='Sales by day of week, $',
yaxis_title='$',
yaxis_range =[0, 16000000],
xaxis_title=None,
showlegend = False,
autosize=True, height = 500,
width = 700
)
fig.show()
def pivotTable(kolonna, parameter):
A = pd.pivot_table(DFD2017, index = "Category Name",
columns = kolonna,
values = [parameter],
aggfunc = np.sum)
return A
def plotting(period, A, B, C, choice, podpis):
fig = go.Figure()
specs = [[{'type':'xy'}], [{'type':'xy'}], [{'type':'xy'}]]
fig = make_subplots(rows=3, cols=1, specs=specs,
subplot_titles=
("Bottles Sold","Volume Sold", "Sales")
)
fig.add_trace(go.Bar(
x=period,
y=A.loc[choice],
name=choice + " (Bottles Sold)",
marker_color="red"
),1,1)
fig.add_trace(go.Bar(
x=period,
y=B.loc[choice],
name= choice + " (Volume Sold (Liters))",
marker_color="blue"
),2,1)
fig.add_trace(go.Bar(
x=period,
y=C.loc[choice],
marker_color="green"
),3,1)
fig.update_yaxes(title_text="Bottles", row=1, col=1)
fig.update_yaxes(title_text="Liters", row=2, col=1)
fig.update_yaxes(title_text="$", row=3, col=1)
fig.update_layout(height=800, width=800,
showlegend=False,
title={
'text': choice + " ("+ podpis +")",
'y':0.98,
'x':0.5},)
fig.show()
def byBeverage(choice):
A = pivotTable(DFD2017.index.weekday, "Bottles Sold")
B = pivotTable(DFD2017.index.weekday, "Volume Sold (Liters)")
C = pivotTable(DFD2017.index.weekday, "Actual Sales(Dollar)")
plotting(weekdays, A,B,C, choice, "Days of week")
A = pivotTable(DFD2017.index.week, "Bottles Sold")
B = pivotTable(DFD2017.index.week, "Volume Sold (Liters)")
C = pivotTable(DFD2017.index.week, "Actual Sales(Dollar)")
plotting(weeks, A,B,C, choice, "Weekly")
A = pivotTable(DFD2017.index.month, "Bottles Sold")
B = pivotTable(DFD2017.index.month, "Volume Sold (Liters)")
C = pivotTable(DFD2017.index.month, "Actual Sales(Dollar)")
plotting(months, A,B,C, choice, "Monthly")
interact(byBeverage,choice=DFD2017["Category Name"])
DFD2017["Category Name"].unique()
byBeverage("Gold Rum")
sample = df_dask.sample(frac = 0.002)
sample['Store Name'].compute().nunique()
sample['Category Name'].compute().nunique()
a=total_maximum_sales_per_store=sample.groupby(['Store Name'])['Actual Sales(Dollar)'].sum().nlargest(5)
fig = go.Figure()
b=a.index.compute()
y=a.compute()
def SetColor(y):
if(y >= 3000):
return "lime"
elif(y >= 1500):
return "purple"
elif(y >= 1000):
return "coral"
fig.add_trace((go.Bar(x=b,y=a,marker=dict(color = list(map(SetColor, y))))))
fig.update_layout(
title={
'text': "top 5 seller"
,
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
c=total_minimum_sales_per_store=sample.groupby(['Store Name'])['Actual Sales(Dollar)'].sum().nsmallest(5)
import plotly.graph_objects as go
fig = go.Figure()
d=c.index.compute()
y=c.compute()
def SetColor(y):
if(y <= 2):
return "red"
elif(y <= 3):
return "blue"
elif(y <= 5):
return "green"
fig = go.Figure((go.Bar(x=d, y=c,marker=dict(color = list(map(SetColor, y))))))
fig.update_layout(
title={
'text': "flop 5 seller",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
e=total_maximum_sales_per_category=sample.groupby(['Category Name'])['Bottles Sold'].sum().nlargest(5)
import plotly.graph_objects as go
fig = go.Figure()
f=e.index.compute()
y=e.compute()
def SetColor(y):
if(y >= 1000):
return "lime"
elif(y >= 600):
return "purple"
elif(y >= 400):
return "coral"
fig.add_trace((go.Bar(x=f,y=e,
marker=dict(color = list(map(SetColor, y))))))
fig.update_layout(
title={
'text': "top 5 best selling alco"
,
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
g=total_minimum_sales_per_category=sample.groupby(['Category Name'])['Bottles Sold'].sum().nsmallest(5)
import plotly.graph_objects as go
fig = go.Figure()
h=g.index.compute()
y=g.compute()
def SetColor(y):
if(y <= 1):
return "red"
elif(y <= 2):
return "blue"
elif(y <= 5):
return "green"
fig = go.Figure((go.Bar(x=h, y=g,marker=dict(color = list(map(SetColor, y))))))
fig.update_layout(
title={
'text': "flop 5 least selling alco",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
df_dask['Store Name'].compute().nunique()
df_dask.groupby(['Store Name'])['Actual Sales(Dollar)'].sum().nlargest(20).compute()
df_dask.groupby(['Store Name'])['Actual Sales(Dollar)'].sum().nsmallest(20).compute()
i=total_maximum_sales_per_store=df_dask.groupby(['Store Name'])['Actual Sales(Dollar)'].sum().nlargest(5)
import plotly.graph_objects as go
fig = go.Figure()
j=i.index.compute()
y=i.compute()
def SetColor(y):
if(y >= 800000):
return "lime"
elif(y >= 4000000):
return "purple"
elif(y >= 200000):
return "coral"
fig.add_trace((go.Bar(x=j,y=i,marker=dict(color = list(map(SetColor, y))))))
fig.update_layout(
title={
'text': "top 5 seller"
,
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
k=total_minimum_sales_per_store=df_dask.groupby(['Store Name'])['Actual Sales(Dollar)'].sum().nsmallest(5)
import plotly.graph_objects as go
fig = go.Figure()
l=k.index.compute()
y=k.compute()
def SetColor(y):
if(y >= 250):
return "green"
elif(y >= 150):
return "blue"
elif(y >= 20):
return "red"
fig = go.Figure((go.Bar(x=l, y=k,marker=dict(color = list(map(SetColor, y))))))
fig.update_layout(
title={
'text': "flop 5 seller",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
df_dask['Category Name'].compute().nunique()
df_dask.groupby(['Category Name'])['Bottles Sold'].sum().nlargest(20).compute()
df_dask.groupby(['Category Name'])['Bottles Sold'].sum().nsmallest(20).compute()
m=total_maximum_sales_per_category=df_dask.groupby(['Category Name'])['Bottles Sold'].sum().nlargest(5)
import plotly.graph_objects as go
fig = go.Figure()
n=m.index.compute()
y=m.compute()
def SetColor(y):
if(y >= 500000):
return "lime"
elif(y >= 200000):
return "purple"
elif(y >= 400):
return "coral"
fig.add_trace((go.Bar(x=n,y=m,
marker=dict(color = list(map(SetColor, y))))))
fig.update_layout(
title={
'text': "top 5 best selling alco"
,
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
o=total_minimum_sales_per_category=df_dask.groupby(['Category Name'])['Bottles Sold'].sum().nsmallest(5)
import plotly.graph_objects as go
fig = go.Figure()
p=o.index.compute()
y=o.compute()
def SetColor(y):
if(y <= 50):
return "red"
elif(y <= 300):
return "blue"
elif(y <= 450):
return "green"
fig = go.Figure((go.Bar(x=p, y=o,marker=dict(color = list(map(SetColor, y))))))
fig.update_layout(
title={
'text': "flop 5 least selling alco",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()